The data come from the Elements of Statistical Learning textbook and are originally from a study by Stamey et al. (1989), in which the authors examined the relationship between the level of prostate-specific antigen (psa) and a number of clinical measures in men who were about to receive a prostatectomy. The variables are as follows:
These data are available in prostate.csv. Let’s start by
reading in the data and some basic EDA.
# Read the prostate data; the original ".././data/" path contained a
# redundant self-reference ("./"), normalized here to the equivalent "../data/".
prostate <- read.csv("../data/prostate.csv")
# First rows: 8 clinical measures, the log-PSA response (lpsa),
# and a logical train/test split indicator in the last column.
head(prostate)
## lcavol lweight age lbph svi lcp gleason pgg45 lpsa
## 1 -0.5798185 2.769459 50 -1.386294 0 -1.386294 6 0 -0.4307829
## 2 -0.9942523 3.319626 58 -1.386294 0 -1.386294 6 0 -0.1625189
## 3 -0.5108256 2.691243 74 -1.386294 0 -1.386294 7 20 -0.1625189
## 4 -1.2039728 3.282789 58 -1.386294 0 -1.386294 6 0 -0.1625189
## 5 0.7514161 3.432373 62 -1.386294 0 -1.386294 6 0 0.3715636
## 6 -1.0498221 3.228826 50 -1.386294 0 -1.386294 6 0 0.7654678
## train
## 1 TRUE
## 2 TRUE
## 3 TRUE
## 4 TRUE
## 5 TRUE
## 6 TRUE
# Column-wise five-number summaries plus means; no NA counts are printed,
# which is consistent with the data having no missingness.
summary(prostate)
## lcavol lweight age lbph
## Min. :-1.3471 Min. :2.375 Min. :41.00 Min. :-1.3863
## 1st Qu.: 0.5128 1st Qu.:3.376 1st Qu.:60.00 1st Qu.:-1.3863
## Median : 1.4469 Median :3.623 Median :65.00 Median : 0.3001
## Mean : 1.3500 Mean :3.629 Mean :63.87 Mean : 0.1004
## 3rd Qu.: 2.1270 3rd Qu.:3.876 3rd Qu.:68.00 3rd Qu.: 1.5581
## Max. : 3.8210 Max. :4.780 Max. :79.00 Max. : 2.3263
## svi lcp gleason pgg45
## Min. :0.0000 Min. :-1.3863 Min. :6.000 Min. : 0.00
## 1st Qu.:0.0000 1st Qu.:-1.3863 1st Qu.:6.000 1st Qu.: 0.00
## Median :0.0000 Median :-0.7985 Median :7.000 Median : 15.00
## Mean :0.2165 Mean :-0.1794 Mean :6.753 Mean : 24.38
## 3rd Qu.:0.0000 3rd Qu.: 1.1787 3rd Qu.:7.000 3rd Qu.: 40.00
## Max. :1.0000 Max. : 2.9042 Max. :9.000 Max. :100.00
## lpsa train
## Min. :-0.4308 Mode :logical
## 1st Qu.: 1.7317 FALSE:30
## Median : 2.5915 TRUE :67
## Mean : 2.4784
## 3rd Qu.: 3.0564
## Max. : 5.5829
# Pairwise plot of all variables except the final train/test indicator column.
# Work on a copy so the original frame keeps svi numeric for modeling.
prostate_gg <- prostate
prostate_gg$svi <- factor(prostate_gg$svi)
ggpairs(prostate_gg, columns = seq_len(ncol(prostate_gg) - 1))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
There are \(N=97\) observations with no missingness. We observe that: age and pgg45 are integer valued, with the latter representing a percent and restricted to the range \([0,100]\); svi is binary and the variable gleason is an ordered categorical variable; the other variables are numerical. We see that both lcavol and lcp seem to have a linear relationship with the response lpsa.
# Split into training and test sets using the logical `train` column,
# then drop that column so it cannot leak into the models.
# (The original code wrapped the subset in a redundant data.frame() call,
# compared the logical column to 1, and mixed base and dplyr idioms.)
train_df <- prostate[prostate$train, , drop = FALSE]
train_df$train <- NULL
test_df <- prostate[!prostate$train, , drop = FALSE]
test_df$train <- NULL
N <- nrow(train_df)       # number of training observations
D <- ncol(train_df) - 1   # number of features (lpsa is the response)
Nnew <- nrow(test_df)     # number of test observations
print(paste("Number of data points =",N, " and number of features =", D))
## [1] "Number of data points = 67 and number of features = 8"
Let’s start by fitting a baseline linear regression model.
First, we will scale the inputs.
# Standardize every column (including the response) to zero mean and unit
# variance, using TRAINING-set statistics for both sets so no test
# information leaks into the preprocessing.
train.m <- colMeans(train_df)
train.s <- apply(train_df, 2, sd)
train_df <- as.data.frame(scale(train_df, center = train.m, scale = train.s))
test_df <- as.data.frame(scale(test_df, center = train.m, scale = train.s))
Fit the linear regression model and visualize the coefficients.
# Fit lm: ordinary least squares of lpsa on all 8 standardized predictors.
# Because both response and predictors were centered, the fitted intercept
# is numerically zero (see the summary below).
lm_prostate = lm(lpsa~., data=train_df)
summary(lm_prostate)
##
## Call:
## lm(formula = lpsa ~ ., data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.36503 -0.28272 -0.04491 0.37209 1.23095
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.200e-16 7.205e-02 0.000 1.00000
## lcavol 5.931e-01 1.105e-01 5.366 1.47e-06 ***
## lweight 2.423e-01 8.808e-02 2.751 0.00792 **
## age -1.180e-01 8.455e-02 -1.396 0.16806
## lbph 1.755e-01 8.538e-02 2.056 0.04431 *
## svi 2.563e-01 1.038e-01 2.469 0.01651 *
## lcp -2.393e-01 1.282e-01 -1.867 0.06697 .
## gleason -1.732e-02 1.180e-01 -0.147 0.88389
## pgg45 2.296e-01 1.321e-01 1.738 0.08755 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5897 on 58 degrees of freedom
## Multiple R-squared: 0.6944, Adjusted R-squared: 0.6522
## F-statistic: 16.47 on 8 and 58 DF, p-value: 2.042e-12
# Tidy the OLS fit with 95% confidence intervals, then plot each slope
# estimate with its interval as a horizontal error bar.
lm_coefs <- tidy(lm_prostate, conf.int = TRUE)
lm_coefs %>%
  filter(term != "(Intercept)") %>%
  # sort the terms by estimate so the largest effect sits at the top
  mutate(term = fct_reorder(term, estimate)) %>%
  ggplot(aes(estimate, term)) +
  geom_point() +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
  # dashed reference line marking a zero effect
  geom_vline(xintercept = 0, lty = 2) +
  labs(
    x = "Estimate of effect of variable on lpsa",
    y = NULL,
    title = "Coefficient plot with error bars") +
  theme_bw()
Compute the RMSE on the test data
# Held-out predictive accuracy of the OLS fit: root mean squared error
# on the (standardized) test responses.
lm_pred <- predict(lm_prostate, newdata = test_df)
rmse_lm <- sqrt(mean((lm_pred - test_df$lpsa)^2))
print(paste("The RMSE for the linear model is ",rmse_lm))
## [1] "The RMSE for the linear model is 0.597769431021197"
Now, we will use glmnet to fit a lasso model. First, we
need to use cross-validation to tune \(\lambda\).
# Choose the lasso penalty by cross-validation. standardize = FALSE because
# the inputs were already standardized above; the seed fixes the CV folds.
set.seed(1)
X <- as.matrix(train_df[, 1:D])
y <- train_df$lpsa
cv.out <- cv.glmnet(X, y, alpha = 1, standardize = FALSE)
plot(cv.out)
bestlam <- cv.out$lambda.min
print(bestlam)
## [1] 0.006281436
Now, let’s refit with the best lambda
# Refit the lasso on the full training set at the CV-selected penalty and
# inspect the coefficients; a "." marks a coefficient shrunk exactly to zero.
glmnet.out <- glmnet(X, y, alpha = 1, lambda = bestlam, standardize = FALSE)
glmnet.out$beta
## 8 x 1 sparse Matrix of class "dgCMatrix"
## s0
## lcavol 0.5728689
## lweight 0.2390965
## age -0.1050754
## lbph 0.1683333
## svi 0.2434682
## lcp -0.1981421
## gleason .
## pgg45 0.1953291
Add the lasso estimates to our plot:
# Rebuild the tidy OLS table and attach the lasso estimates as a new column.
# NOTE(review): predict(type = "coefficients") returns a 9x1 matrix
# (intercept first); this assumes its row order matches tidy()'s term order
# — appears to hold here, but worth confirming.
# The `lasso` column is reused later by the Bayesian-lasso comparison plot.
tidy_reg_conf_int = tidy(lm_prostate,conf.int =TRUE)
tidy_reg_conf_int$lasso = as.matrix(predict(glmnet.out,type="coefficients",s=bestlam))
tidy_reg_conf_int %>%
filter(term != "(Intercept)") %>%
# reorder the coefficients so that the largest is at the top of the plot
mutate(term = fct_reorder(term, estimate)) %>%
ggplot(aes(estimate, term)) +
geom_point() +
geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
# lasso estimates overlaid as blue triangles (shape 2)
geom_point(aes(lasso, term),shape = 2, color = 'blue') +
# add in a dotted line at zero
geom_vline(xintercept = 0, lty = 2) +
labs(
x = "Estimate of effect of variable on lpsa",
y = NULL,
title = "Coefficient plot with error bars") +
theme_bw()
Compute the RMSE on the test data
# Test-set RMSE for the lasso fit; Xtest is kept for reuse further below.
Xtest <- as.matrix(test_df[, 1:D])
lasso_pred <- predict(glmnet.out, newx = Xtest, s = bestlam)
rmse_lasso <- sqrt(mean((lasso_pred - test_df$lpsa)^2))
print(paste("The RMSE for the lasso is ",rmse_lasso))
## [1] "The RMSE for the lasso is 0.587015428770515"
Exercise: Try to repeat for ridge and elastic net and compare the coefficients and RMSE.
Run a simple Gibbs sampling algorithm.
# Define prior parameters for the Bayesian lasso Gibbs sampler.
# NOTE(review): the meaning of s, a_sigma, b_sigma, s2_0 depends on the
# gibbs_blasso implementation (not shown here) — presumably the lasso scale
# and an inverse-gamma prior on the noise variance; confirm against its code.
priors = list(s=1, a_sigma = 2, b_sigma = 1, s2_0 = 10)
I = 2000 #number of iterations
# Initialize at the OLS solution so the chain starts near the posterior mode
w_init = lm_prostate$coefficients
sigma_init = mean(lm_prostate$residuals^2)
#Run MCMC (gibbs_blasso is defined elsewhere in this project)
blasso_out = gibbs_blasso(X,y, priors, I, w_init, sigma_init)
## [1] "Number of iterations completed: 100"
## [1] "Number of iterations completed: 200"
## [1] "Number of iterations completed: 300"
## [1] "Number of iterations completed: 400"
## [1] "Number of iterations completed: 500"
## [1] "Number of iterations completed: 600"
## [1] "Number of iterations completed: 700"
## [1] "Number of iterations completed: 800"
## [1] "Number of iterations completed: 900"
## [1] "Number of iterations completed: 1000"
## [1] "Number of iterations completed: 1100"
## [1] "Number of iterations completed: 1200"
## [1] "Number of iterations completed: 1300"
## [1] "Number of iterations completed: 1400"
## [1] "Number of iterations completed: 1500"
## [1] "Number of iterations completed: 1600"
## [1] "Number of iterations completed: 1700"
## [1] "Number of iterations completed: 1800"
## [1] "Number of iterations completed: 1900"
## [1] "Number of iterations completed: 2000"
Check mixing and determine burnin (note here we started with good initial values!)
# Traceplots of the coefficient, shrinkage and noise-variance draws,
# to eyeball mixing before choosing a burn-in.
traceplot(as.mcmc(blasso_out$w))
traceplot(as.mcmc(blasso_out$tau))
traceplot(as.mcmc(blasso_out$sigma))
# Check other diagnostics in coda
# Remove burnin.
# NOTE(review): b:I keeps draw b itself, so only 999 draws are discarded
# and 1001 are retained; (b+1):I would discard exactly the first 1000.
# Left as-is because the printed results below depend on this choice.
b = 1000
blasso_out$w = blasso_out$w[b:I,]
blasso_out$tau = blasso_out$tau[b:I,]
blasso_out$sigma = blasso_out$sigma[b:I]
Let’s compare the coefficients.
# Compute posterior mean and HPD intervals from the retained Gibbs draws
# (one column of blasso_out$w per coefficient, intercept included).
bl_what = apply(blasso_out$w,2,mean)
bl_wci = apply(blasso_out$w,2,function(x){HPDinterval(as.mcmc(x))})
# Reuse the tidy table built earlier (which already carries the `lasso`
# column): stash the OLS estimates, then overwrite estimate/conf.low/conf.high
# with the Bayesian-lasso posterior summaries.
# NOTE(review): this assumes the column order of blasso_out$w matches the
# row (term) order of the tidy table — confirm against gibbs_blasso.
tidy_reg_conf_int$ls = lm_prostate$coefficients
tidy_reg_conf_int$estimate = bl_what
tidy_reg_conf_int$conf.low = bl_wci[1,]
tidy_reg_conf_int$conf.high = bl_wci[2,]
tidy_reg_conf_int %>%
filter(term != "(Intercept)") %>%
# reorder the coefficients so that the largest is at the top of the plot
mutate(term = fct_reorder(term, estimate)) %>%
ggplot(aes(estimate, term)) +
geom_point() +
geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
# lasso estimates as blue triangles (shape 2)
geom_point(aes(lasso, term),shape = 2, color = 'blue') +
# OLS estimates as red crosses (shape 4)
geom_point(aes(ls, term),shape = 4, color = 'red') +
# add in a dotted line at zero
geom_vline(xintercept = 0, lty = 2) +
labs(
x = "Estimate of effect of variable on lpsa",
y = NULL,
title = "Coefficient plot with error bars") +
theme_bw()
Compute the RMSE:
# Test-set RMSE for the posterior-mean coefficients; a column of ones is
# prepended (cbind recycles the scalar 1) to apply the intercept.
yhat_bl <- cbind(1, Xtest) %*% bl_what
rmse_blasso <- sqrt(mean((yhat_bl - test_df$lpsa)^2))
print(paste("The RMSE for the bayesian lasso is ",rmse_blasso))
## [1] "The RMSE for the bayesian lasso is 0.586549304005275"
You can also try to compare with the package bayesreg,
which implements a number of shrinkage priors.
library(bayesreg)
# Fit the same Bayesian lasso model with the bayesreg package for comparison;
# n.samples matches the I = 2000 iterations used by our Gibbs sampler.
fit <- bayesreg(lpsa~., train_df, model = "gaussian", prior = "lasso", n.samples = 2000)
How do the results compare?
# Traceplots of the bayesreg draws. bayesreg stores samples with parameters
# in rows and iterations in columns, hence the transposes before as.mcmc().
traceplot(as.mcmc(t(fit$beta)))
traceplot(as.mcmc(t(fit$beta0)))
traceplot(as.mcmc(t(fit$sigma2)))
traceplot(as.mcmc(t(fit$tau2)))
# Check other diagnostics in coda
# Remove burnin.
# NOTE(review): reuses I from the Gibbs block — only valid because
# n.samples = 2000 matches I; and b:I keeps draw b (same off-by-one as above).
b = 1000
fit$beta = fit$beta[,b:I]
fit$beta0 = fit$beta0[b:I]
fit$sigma2 = fit$sigma2[b:I]
fit$tau2 = fit$tau2 [b:I]
# Scatter plot of posterior draws for two selected coefficients, colored by
# which sampler produced them (our Gibbs chain vs. bayesreg's).
coeff_names <- names(train_df)[1:D]
ind1 <- 1
ind2 <- 6
# Gibbs draws carry the intercept in column 1, hence the +1 column offset;
# bayesreg's fit$beta indexes coefficients directly by row.
n_gibbs <- nrow(blasso_out$w)
n_hmc <- ncol(fit$beta)
draws_df <- data.frame(
  w1 = c(blasso_out$w[, ind1 + 1], fit$beta[ind1, ]),
  w2 = c(blasso_out$w[, ind2 + 1], fit$beta[ind2, ]),
  mcmc = c(rep("Gibbs", n_gibbs), rep("HMC", n_hmc))
)
ggplot(draws_df) +
  geom_point(aes(x = w1, y = w2, color = mcmc)) +
  theme_bw() +
  labs(x = coeff_names[ind1], y = coeff_names[ind2])